## Programmer: Vito D'Orazio
## select_sequences_111914.r

## inputs: events_GTDS_old.dta
## outputs: select_sequences_111914.RData

## This program begins by reading as input the merged CAMEO-coded event data and the 5 GTDS conflict variables.
## For each of the TRUE potential peace observations, a single draw is taken from a Bernoulli distribution
## with the probability of drawing a 1 equal to the chance object.
## The chance object is assigned a value equal to the number of TRUE conflict onsets divided by the number of
## TRUE potential peace observations multiplied by set.peaces, a number to be chosen by the user.
## For each TRUE potential peace observation where a 1 is drawn from the Bernoulli, that observation along with
## the temporally preceding 12 observations are kept for the next step of the sequence analysis.


rm(list=ls())
library(foreign)
library(zoo)

setwd("/Users/vjdorazio/Desktop/Sequence_Analysis_paper/replication_data")

mydata <- read.dta("events_GTDS_old.dta")
#mydata <- read.dta("events_GTDS.dta")

## SECTION 2.a ## SECTION 2.a ## SECTION 2.a ## SECTION 2.a ##
##############################################################
## ONSET AND POTENTIAL PEACE #### ONSET AND POTENTIAL PEACE ##

# The following section codes 10 new vectors; 2 for each GTDS variable.
# Every GTDS variable is coded as an onset, a potential peace or left at 0.

gtds.reb <- mydata$gtdsrebellion
reb.potpeace <- vector(mode = "integer", length = nrow(mydata))
reb.onset <- vector(mode = "integer", length = nrow(mydata))

gtds.ins <- mydata$gtdsinsurgency
ins.potpeace <- vector(length = nrow(mydata))
ins.onset <- vector(length = nrow(mydata))

gtds.dom <- mydata$gtdscrisisdomestic
dom.potpeace <- vector(length = nrow(mydata))
dom.onset <- vector(length = nrow(mydata))

gtds.vio <- mydata$gtdsviolence
vio.potpeace <- vector(length = nrow(mydata))
vio.onset <- vector(length = nrow(mydata))

gtds.int <- mydata$gtdsinternationalcrisis
int.potpeace <- vector(length = nrow(mydata))
int.onset <- vector(length = nrow(mydata))

reb.count <- 0
ins.count <- 0
dom.count <- 0
vio.count <- 0
int.count <- 0

for(i in 1:nrow(mydata)) {

    # resetting all counts when we index to a new country
    if(i>1 && mydata$COUNTRY[i] != mydata$COUNTRY[i-1]) {
        reb.count <- 0
        ins.count <- 0
        dom.count <- 0
        vio.count <- 0
        int.count <- 0
        next
    }

    # rebellion
    if (gtds.reb[i] == 0) { # no GTDS conflict
        reb.count <- reb.count+1
        if(reb.count > 12) {
            reb.potpeace[i] <- 1
        }
    }
    if (gtds.reb[i] == 1) { # yes GTDS conflict
        if(reb.count >= 12) {
            reb.onset[i] <- 1
        }
        reb.count <- 0
    }

    # insurgency
    if (gtds.ins[i] == 0) { # no GTDS conflict
        ins.count <- ins.count+1
        if(ins.count > 12) {
            ins.potpeace[i] <- 1
        }
    }
    if (gtds.ins[i] == 1) { # yes GTDS conflict
        if(ins.count >= 12) {
            ins.onset[i] <- 1
        }
        ins.count <- 0
    }

    # domestic crisis
    if (gtds.dom[i] == 0) { # no GTDS conflict
        dom.count <- dom.count+1
        if(dom.count > 12) {
            dom.potpeace[i] <- 1
        }
    }
    if (gtds.dom[i] == 1) { # yes GTDS conflict
        if(dom.count >= 12) {
            dom.onset[i] <- 1
        }
        dom.count <- 0
    }

    # violence
    if (gtds.vio[i] == 0) { # no GTDS conflict
        vio.count <- vio.count+1
        if(vio.count > 12) {
            vio.potpeace[i] <- 1
        }
    }
    if (gtds.vio[i] == 1) { # yes GTDS conflict
        if(vio.count >= 12) {
            vio.onset[i] <- 1
        }
        vio.count <- 0
    }

    # international crisis
    if (gtds.int[i] == 0) { # no GTDS conflict
        int.count <- int.count+1
        if(int.count > 12) {
            int.potpeace[i] <- 1
        }
    }
    if (gtds.int[i] == 1) { # yes GTDS conflict
        if(int.count >= 12) {
            int.onset[i] <- 1
        }
        int.count <- 0
    }
} # end for loop

mydata <- cbind(mydata,reb.onset,reb.potpeace,ins.onset,ins.potpeace,dom.onset,dom.potpeace,vio.onset,vio.potpeace,int.onset,int.potpeace)


## SECTION 2.b ## SECTION 2.b ## SECTION 2.b #####
##################################################
## SELECTING SEQUENCES #### SELECTING SEQUENCES ##

# Subsetting out all states that never experience conflict

# properly sorting the data
mydata <- mydata[order(mydata$COUNTRY,mydata$date_str),]

## The following section of code subsets mydata to only include the states
## that experience at least one observation of conflict onset
keep.state <- vector(mode="character",length = length(unique(mydata$COUNTRY)))
temp.data <- matrix(data=NA, nrow=0,ncol=ncol(mydata))

# int.onset is not included because in our study we are forecasting domestic violence
keep.data <- mydata[which(mydata$reb.onset == 1 | mydata$ins.onset == 1
              | mydata$dom.onset == 1 | mydata$vio.onset == 1),]
keep.data$COUNTRY <- as.vector(keep.data$COUNTRY)
keep.state <- unique(keep.data$COUNTRY)
keep.state <- as.vector(keep.state)
mydata$COUNTRY <- as.vector(mydata$COUNTRY)
for (j in 1:length(keep.state)) {
  temp.data <- rbind(temp.data,subset(mydata, mydata$COUNTRY == keep.state[j]))
}

# reestablish mydata and sorting
mydata <- temp.data[order(temp.data$COUNTRY,temp.data$date_str),]

## The following section of code selects the potential peace sequences to be kept

set.seed(4422)
set.peaces <- 2 # multiplied by the number of onsets for the total number of peace sequences to be kept

# int.onset is not included because in our study we are forecasting domestic violence
sum.onset <- sum(mydata$reb.onset, mydata$ins.onset, mydata$dom.onset,
                   mydata$vio.onset, na.rm=TRUE)

# Important theoretical decision was made here to ONLY select observations
# where we observe potential peace sequences across ALL conflict variables
# This can be thought of as the GTDS version of a "total peace"

total.peace <- vector(mode="numeric",length=nrow(mydata)) # a vector of zeros
total.peace[mydata$reb.potpeace==1 & mydata$ins.potpeace==1 & mydata$dom.potpeace==1
                   & mydata$vio.potpeace==1] <- 1
sum.peace <- sum(total.peace, na.rm=TRUE)
chance <- set.peaces * sum.onset / sum.peace

mydata <- cbind(mydata,total.peace)

##############################################################################################################

keep.peace <- vector(mode="numeric",length=nrow(mydata)) # a vector of zeros

# a single draw from a binomial distribution of size 1... a Bernoulli...
# with the probability of drawing a 1 equal to chance

for(k in 1:nrow(mydata)) {
  if(mydata$total.peace[k] == 0) next
  keep.peace[k] <- rbinom(n=1,size=1,prob=chance)
}
mydata <- cbind(mydata,keep.peace)

mydata <- mydata[order(mydata$COUNTRY,mydata$date_str),]

# These are the observations that end the 13-week sequences
keep.obs <- vector(mode="numeric",length=nrow(mydata)) # a vector of zeros
keep.obs[mydata$keep.peace==1] <- 1
keep.obs[mydata$reb.onset==1] <- 1
keep.obs[mydata$ins.onset==1] <- 1
keep.obs[mydata$dom.onset==1] <- 1
keep.obs[mydata$vio.onset==1] <- 1

mydata <- cbind(mydata,keep.obs)

mydata$date_str[which(nchar(mydata$date_str)==6)] <- paste(substr(mydata$date_str[which(nchar(mydata$date_str)==6)], 1,5), "0", substr(mydata$date_str[which(nchar(mydata$date_str)==6)], 6,6), sep="")

mydata <- mydata[order(mydata$COUNTRY,mydata$date_str),]

mydata$month[which(mydata$month=="")] <- NA
mydata$ctryname[which(mydata$ctryname=="")] <- NA

mydata$month <- na.locf(mydata$month)
mydata$ctryname <- na.locf(mydata$ctryname)

mydata$Other_Crisis <- 0

## SECTION 2.c ## SECTION 2.c ## SECTION 2.c ## SECTION 2.c #############
#########################################################################
## ESTABLISHING SEQUENCES  ESTABLISHING SEQUENCES  ESTABLISHING SEQUENCES

## This following uses 3d arrays to form our sequences.
## Picture a draw with a stack of papers.  Each piece of paper is an 12x13 matrix.
## Each 12x13 matrix corresponds to a single sequence-worth of data.
## We have 12 variables and 13 weeks.

# Establish two separate 3-dimensional arrays.
# One is comrpised of the t-12 to t-1 week sequences preceding a true keep.obs.
# The other is comprised of the t-17 to t-6 week sequences preceding a true keep.obs.

sum.seq <- sum(mydata$keep.obs)
history <- array(0,dim=c(12,13,sum.seq))
lag.history <- array(0,dim=c(11,13,sum.seq)) # some of the higher number matrices will be empty e.g. lag.history[,,230]
count.seq <- 0
count.lagseq <- 0 # vjd 10/16/10
temp.counts <- vector(mode="numeric",length=4)
temp.week <- "1998w01"
temp.state <- "USA"
temp.crises <- 0


# Nested for loops that fill in our history array
# history[,,1] contains the data of one sequence at index 1
for(i in 1:nrow(mydata)) {
    cat(i, " \n")
  if(isTRUE(mydata$keep.obs[i]==1)) { # if the observation is either an onset OR a selected peace
    temp.type.seq <- 0
    if(isTRUE(mydata$keep.peace[i]==1)) { # the observation is a selected peace
        temp.type.seq <- 1
    }
    
    # coding other crises
    if(mydata$month[i]=="1998m1") {
        temp.crises <- 0
    } else {
        temp.ctry <- mydata$ctryname[i]
        lastmonth <- as.numeric(substr(mydata$month[i], 6, 7)) - 1
        if(lastmonth==0) {
            lastyear <- as.numeric(substr(mydata$month[i], 1, 4)) - 1
            monthmatch <- paste(lastyear, "m12", sep="")
        } else {
            monthmatch <- paste(substr(mydata$month[i], 1, 4), "m", lastmonth, sep="")
        }
        cat(monthmatch, " \n")
        t <- mydata[which(mydata$ctryname==temp.ctry & mydata$month==monthmatch), c("gtdsrebellion", "gtdsinsurgency", "gtdscrisisdomestic", "gtdsviolence")]
        temp.crises <- sum(t[1,])
    }
    
    count.seq <- count.seq+1
    count.j <- 1
    if(i<12){next} #VJD: 8/12/14
    for(j in (i-12):i) {
      temp.counts <- as.vector(c(mydata$A_verb_coop[j],mydata$A_mater_coop[j],mydata$A_verb_confl[j],mydata$A_mater_confl[j],mydata$B_verb_coop[j],mydata$B_mater_coop[j],mydata$B_verb_confl[j],mydata$B_mater_confl[j]))
      temp.week <- as.character(mydata$date_str[j])
      temp.state <- as.character(mydata$COUNTRY[j])
      history[,count.j,count.seq] <- c(temp.week,temp.state,temp.counts,temp.type.seq, temp.crises)
      count.j <- count.j+1
    } # end for j

    # for lag.history and the 5-week lag in general it is necessary to deal with an out-of-bounds issue for
    # any time that total.peace equals 1 within the first 17 weeks - at week 18 we no longer have this problem
    # by definition for there to be an onset of conflict or peace, it must be preceded by 12 weeks of peace.
    # this means i-12 never overlaps with the previous country OR goes out-of-bounds for the first country.
    # BUT, i-17 will not only go out-of-bounds for the first country, it could contain observations from the previous country. vjd 10/16

    count.k <- 1

    # if the year is greater than 1998 OR the week is greater than 17
    if(isTRUE(as.numeric(substr(mydata$date_str[i],1,4))>1998) | isTRUE(as.numeric(substr(mydata$date_str[i],6,7))>17)) {
        count.lagseq <- count.lagseq + 1
        if(i<17){next} #VJD: 8/12/14
        for(k in (i-17):(i-5)) {
            temp.counts <- as.vector(c(mydata$A_verb_coop[k],mydata$A_mater_coop[k],mydata$A_verb_confl[k],mydata$A_mater_confl[k],mydata$B_verb_coop[k],mydata$B_mater_coop[k],mydata$B_verb_confl[k],mydata$B_mater_confl[k]))
            temp.week <- as.character(mydata$date_str[k])
            temp.state <- as.character(mydata$COUNTRY[k])
            lag.history[,count.k,count.lagseq] <- c(temp.week,temp.state,temp.counts,temp.type.seq)
            count.k <- count.k+1
        } # end for k
    } # end if year is greater than 1998 or week is greater than 17
  } # end if total.peace equals 1
} # end ext for

rownames(history) <- c("time","state","A_verb_coop","A_mater_coop","A_verb_confl","A_mater_confl","B_verb_coop","B_mater_coop","B_verb_confl","B_mater_confl","seq_type", "Other_Crises")
rownames(lag.history) <- c("time","state","A_verb_coop","A_mater_coop","A_verb_confl","A_mater_confl","B_verb_coop","B_mater_coop","B_verb_confl","B_mater_confl","seq_type")

save(list=ls(), file="select_sequences_111914.RData")


